Domain expert want to understand major trends and insights about NYC crime/violence from several public data source to enable the police officers on patrol. This report analyzes and looks for trends that “beat officers” can use in their daily operations.
#load libraries
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.4
v tibble 3.0.3 v dplyr 1.0.0
v tidyr 1.1.0 v stringr 1.4.0
v readr 1.3.1 v forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(leaflet)
package 㤼㸱leaflet㤼㸲 was built under R version 4.0.3Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
#adjust options
options(scipen = 666) # remove scientific numbers
# Read Data from internet
# df_arrests_hist <-read_csv("https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD")%>%
# janitor::clean_names()
df_arrests_hist <-read_csv("df_arrests_hist.csv")%>%
janitor::clean_names()
Parsed with column specification:
cols(
arrest_key = col_double(),
arrest_date = col_character(),
pd_cd = col_double(),
pd_desc = col_character(),
ky_cd = col_double(),
ofns_desc = col_character(),
law_code = col_character(),
law_cat_cd = col_character(),
arrest_boro = col_character(),
arrest_precinct = col_double(),
jurisdiction_code = col_double(),
age_group = col_character(),
perp_sex = col_character(),
perp_race = col_character(),
x_coord_cd = col_double(),
y_coord_cd = col_double(),
latitude = col_double(),
longitude = col_double(),
lon_lat = col_character()
)
df_arrests_ytd <- read_csv("https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD")%>%
janitor::clean_names()
Parsed with column specification:
cols(
ARREST_KEY = col_double(),
ARREST_DATE = col_character(),
PD_CD = col_double(),
PD_DESC = col_character(),
KY_CD = col_double(),
OFNS_DESC = col_character(),
LAW_CODE = col_character(),
LAW_CAT_CD = col_character(),
ARREST_BORO = col_character(),
ARREST_PRECINCT = col_double(),
JURISDICTION_CODE = col_double(),
AGE_GROUP = col_character(),
PERP_SEX = col_character(),
PERP_RACE = col_character(),
X_COORD_CD = col_double(),
Y_COORD_CD = col_double(),
Latitude = col_double(),
Longitude = col_double(),
`New Georeferenced Column` = col_character()
)
df_shooting <- read_csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD") %>%
janitor::clean_names()
Parsed with column specification:
cols(
INCIDENT_KEY = col_double(),
OCCUR_DATE = col_character(),
OCCUR_TIME = col_time(format = ""),
BORO = col_character(),
PRECINCT = col_double(),
JURISDICTION_CODE = col_double(),
LOCATION_DESC = col_character(),
STATISTICAL_MURDER_FLAG = col_logical(),
PERP_AGE_GROUP = col_character(),
PERP_SEX = col_character(),
PERP_RACE = col_character(),
VIC_AGE_GROUP = col_character(),
VIC_SEX = col_character(),
VIC_RACE = col_character(),
X_COORD_CD = col_number(),
Y_COORD_CD = col_number(),
Latitude = col_double(),
Longitude = col_double(),
Lon_Lat = col_character()
)
glimpse(df_arrests_hist)
Rows: 5,012,956
Columns: 19
$ arrest_key <dbl> 144026181, 144507595, 144565062, 144500188, 144216044, 144925030,...
$ arrest_date <chr> "06/26/2015", "07/14/2015", "07/16/2015", "07/14/2015", "07/03/20...
$ pd_cd <dbl> 639, 969, 101, 879, 478, 339, 849, 203, 511, 511, 750, 339, 847, ...
$ pd_desc <chr> "AGGRAVATED HARASSMENT 2", "TRAFFIC,UNCLASSIFIED INFRACTION", "AS...
$ ky_cd <dbl> 361, 881, 344, 675, 343, 341, 677, 352, 235, 235, 359, 341, 125, ...
$ ofns_desc <chr> "OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV", "OTHER TRAFFIC INF...
$ law_code <chr> "PL 2403002", "VTL051101A", "PL 1200001", "AC 010125B", "PL 16515...
$ law_cat_cd <chr> "M", "M", "M", "V", "M", "M", "V", "M", "M", "M", "M", "M", "F", ...
$ arrest_boro <chr> "Q", "M", "K", "Q", "M", "B", "K", "B", "B", "Q", "Q", "M", "M", ...
$ arrest_precinct <dbl> 102, 10, 90, 103, 10, 45, 78, 47, 52, 115, 111, 30, 5, 110, 48, 1...
$ jurisdiction_code <dbl> 0, 3, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0,...
$ age_group <chr> "45-64", "25-44", "18-24", "25-44", "18-24", "18-24", "18-24", "2...
$ perp_sex <chr> "M", "M", "F", "M", "M", "M", "M", "M", "F", "M", "M", "F", "M", ...
$ perp_race <chr> "WHITE HISPANIC", "WHITE HISPANIC", "WHITE HISPANIC", "WHITE HISP...
$ x_coord_cd <dbl> 1031076, 984791, 994026, 1037132, 984602, 1030990, 991330, 102801...
$ y_coord_cd <dbl> 193779, 209846, 195548, 196129, 210686, 255310, 187303, 262766, 2...
$ latitude <dbl> 40.69844, 40.74266, 40.70341, 40.70486, 40.74497, 40.86733, 40.68...
$ longitude <dbl> -73.83113, -73.99805, -73.96474, -73.80927, -73.99873, -73.83101,...
$ lon_lat <chr> "POINT (-73.83112953899997 40.69843969400005)", "POINT (-73.99804...
glimpse(df_arrests_ytd)
Rows: 103,376
Columns: 19
$ arrest_key <dbl> 208368444, 209487362, 208853853, 209621232, 208350647, 207...
$ arrest_date <chr> "01/22/2020", "02/13/2020", "02/01/2020", "02/16/2020", "0...
$ pd_cd <dbl> 729, 101, 779, 905, 259, 339, 101, 168, 779, 105, 969, 439...
$ pd_desc <chr> "FORGERY,ETC.,UNCLASSIFIED-FELO", "ASSAULT 3", "PUBLIC ADM...
$ ky_cd <dbl> 113, 344, 126, 347, 351, 341, 344, 116, 126, 106, 881, 109...
$ ofns_desc <chr> "FORGERY", "ASSAULT 3 & RELATED OFFENSES", "MISCELLANEOUS ...
$ law_code <chr> "PL 1702500", "PL 1200001", "PL 215510B", "VTL11920U2", "P...
$ law_cat_cd <chr> "F", "M", "F", "M", "M", "M", "M", "F", "F", "F", "M", "F"...
$ arrest_boro <chr> "K", "B", "Q", "B", "B", "K", "K", "K", "Q", "Q", "K", "Q"...
$ arrest_precinct <dbl> 69, 46, 115, 45, 43, 63, 90, 77, 103, 106, 60, 100, 46, 40...
$ jurisdiction_code <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0...
$ age_group <chr> "25-44", "45-64", "65+", "45-64", "18-24", "18-24", "45-64...
$ perp_sex <chr> "M", "M", "M", "M", "F", "M", "M", "M", "F", "F", "M", "M"...
$ perp_race <chr> "BLACK", "BLACK", "WHITE HISPANIC", "WHITE HISPANIC", "BLA...
$ x_coord_cd <dbl> 1012161, 1011751, 1015995, 1032817, 1020183, 1003772, 1003...
$ y_coord_cd <dbl> 178176, 250275, 212787, 257638, 239283, 161309, 197201, 18...
$ latitude <dbl> 40.65569, 40.85359, 40.75068, 40.87371, 40.82339, 40.60942...
$ longitude <dbl> -73.89941, -73.90059, -73.88543, -73.82439, -73.87017, -73...
$ new_georeferenced_column <chr> "POINT (-73.89940857299997 40.655692874000074)", "POINT (-...
glimpse(df_shooting)
Rows: 21,626
Columns: 19
$ incident_key <dbl> 74146165, 66928846, 29114164, 85180336, 73405770, 33397043,...
$ occur_date <chr> "08/14/2010", "10/17/2009", "05/18/2007", "06/09/2012", "06...
$ occur_time <time> 03:11:00, 18:03:00, 23:00:00, 17:15:00, 04:14:00, 23:05:00...
$ boro <chr> "QUEENS", "BROOKLYN", "BROOKLYN", "BROOKLYN", "BRONX", "QUE...
$ precinct <dbl> 113, 67, 75, 81, 47, 110, 114, 113, 113, 43, 40, 110, 40, 6...
$ jurisdiction_code <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ location_desc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ statistical_murder_flag <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
$ perp_age_group <chr> NA, NA, NA, NA, NA, NA, "25-44", NA, NA, NA, NA, NA, NA, NA...
$ perp_sex <chr> NA, NA, NA, NA, NA, NA, "M", NA, NA, NA, NA, NA, NA, NA, "M...
$ perp_race <chr> NA, NA, NA, NA, NA, NA, "BLACK", NA, NA, NA, NA, NA, NA, NA...
$ vic_age_group <chr> "25-44", "45-64", "25-44", "25-44", "25-44", "18-24", "25-4...
$ vic_sex <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M",...
$ vic_race <chr> "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "BLAC...
$ x_coord_cd <dbl> 1046573, 1003313, 1016292, 1005597, 1023551, 1015948, 10039...
$ y_coord_cd <dbl> 183057, 176413, 176228, 188673, 263366, 210428, 214344, 182...
$ latitude <dbl> 40.66891, 40.65088, 40.65033, 40.68452, 40.88947, 40.74420,...
$ longitude <dbl> -73.77534, -73.93130, -73.88453, -73.92303, -73.85786, -73....
$ lon_lat <chr> "POINT (-73.77534099699994 40.66891477200004)", "POINT (-73...
summary(df_arrests_hist)
arrest_key arrest_date pd_cd pd_desc ky_cd
Min. : 9926901 Length:5012956 Min. : 0.0 Length:5012956 Min. :101.0
1st Qu.: 59318539 Class :character 1st Qu.:293.0 Class :character 1st Qu.:126.0
Median : 83458938 Mode :character Median :511.0 Mode :character Median :341.0
Mean : 95791494 Mean :511.7 Mean :301.6
3rd Qu.:143546667 3rd Qu.:750.0 3rd Qu.:348.0
Max. :206893600 Max. :997.0 Max. :995.0
NA's :261 NA's :9029
ofns_desc law_code law_cat_cd arrest_boro arrest_precinct
Length:5012956 Length:5012956 Length:5012956 Length:5012956 Min. : 1.00
Class :character Class :character Class :character Class :character 1st Qu.: 33.00
Mode :character Mode :character Mode :character Mode :character Median : 60.00
Mean : 60.62
3rd Qu.: 84.00
Max. :123.00
jurisdiction_code age_group perp_sex perp_race x_coord_cd
Min. : 0.000 Length:5012956 Length:5012956 Length:5012956 Min. : 913357
1st Qu.: 0.000 Class :character Class :character Class :character 1st Qu.: 993370
Median : 0.000 Mode :character Mode :character Mode :character Median :1004890
Mean : 1.305 Mean :1005357
3rd Qu.: 0.000 3rd Qu.:1015835
Max. :97.000 Max. :1067302
NA's :10 NA's :1
y_coord_cd latitude longitude lon_lat
Min. : 121131 Min. :40.50 Min. :-74.25 Length:5012956
1st Qu.: 186886 1st Qu.:40.68 1st Qu.:-73.97 Class :character
Median : 209491 Median :40.74 Median :-73.93 Mode :character
Mean : 214966 Mean :40.76 Mean :-73.92
3rd Qu.: 236614 3rd Qu.:40.82 3rd Qu.:-73.89
Max. :8202360 Max. :62.08 Max. :-73.68
NA's :1 NA's :1 NA's :1
summary(df_arrests_ytd)
arrest_key arrest_date pd_cd pd_desc ky_cd
Min. :206890919 Length:103376 Min. : 0.0 Length:103376 Min. :101.0
1st Qu.:209754714 Class :character 1st Qu.:113.0 Class :character 1st Qu.:109.0
Median :212415780 Mode :character Median :339.0 Mode :character Median :235.0
Mean :212534707 Mean :401.4 Mean :241.9
3rd Qu.:215249521 3rd Qu.:639.0 3rd Qu.:344.0
Max. :218599667 Max. :969.0 Max. :995.0
NA's :14 NA's :24
ofns_desc law_code law_cat_cd arrest_boro arrest_precinct
Length:103376 Length:103376 Length:103376 Length:103376 Min. : 1.00
Class :character Class :character Class :character Class :character 1st Qu.: 40.00
Mode :character Mode :character Mode :character Mode :character Median : 62.00
Mean : 63.34
3rd Qu.:101.00
Max. :123.00
jurisdiction_code age_group perp_sex perp_race x_coord_cd
Min. : 0.000 Length:103376 Length:103376 Length:103376 Min. : 914321
1st Qu.: 0.000 Class :character Class :character Class :character 1st Qu.: 992254
Median : 0.000 Mode :character Mode :character Mode :character Median :1005312
Mean : 1.555 Mean :1005757
3rd Qu.: 0.000 3rd Qu.:1017440
Max. :97.000 Max. :1067185
y_coord_cd latitude longitude new_georeferenced_column
Min. :121131 Min. :40.50 Min. :-74.25 Length:103376
1st Qu.:185601 1st Qu.:40.68 1st Qu.:-73.97 Class :character
Median :206647 Median :40.73 Median :-73.92 Mode :character
Mean :208155 Mean :40.74 Mean :-73.92
3rd Qu.:236150 3rd Qu.:40.81 3rd Qu.:-73.88
Max. :271820 Max. :40.91 Max. :-73.70
summary(df_shooting)
incident_key occur_date occur_time boro precinct
Min. : 9953245 Length:21626 Length:21626 Length:21626 Min. : 1.00
1st Qu.: 51709091 Class :character Class1:hms Class :character 1st Qu.: 44.00
Median : 80495596 Mode :character Class2:difftime Mode :character Median : 69.00
Mean : 92011670 Mode :numeric Mean : 66.23
3rd Qu.:140791304 3rd Qu.: 81.00
Max. :206891917 Max. :123.00
jurisdiction_code location_desc statistical_murder_flag perp_age_group
Min. :0.000 Length:21626 Mode :logical Length:21626
1st Qu.:0.000 Class :character FALSE:17499 Class :character
Median :0.000 Mode :character TRUE :4127 Mode :character
Mean :0.327
3rd Qu.:0.000
Max. :2.000
NA's :2
perp_sex perp_race vic_age_group vic_sex vic_race
Length:21626 Length:21626 Length:21626 Length:21626 Length:21626
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
x_coord_cd y_coord_cd latitude longitude lon_lat
Min. : 914928 Min. :125757 Min. :40.51 Min. :-74.25 Length:21626
1st Qu.: 999925 1st Qu.:182658 1st Qu.:40.67 1st Qu.:-73.94 Class :character
Median :1007645 Median :193486 Median :40.70 Median :-73.92 Mode :character
Mean :1009370 Mean :207382 Mean :40.74 Mean :-73.91
3rd Qu.:1016865 3rd Qu.:239187 3rd Qu.:40.82 3rd Qu.:-73.88
Max. :1066815 Max. :271128 Max. :40.91 Max. :-73.70
df_arrests <- df_arrests_hist %>%
bind_rows(df_arrests_ytd)%>%
distinct() %>%
drop_na(ofns_desc)
glimpse(df_arrests)
Rows: 5,107,279
Columns: 20
$ arrest_key <dbl> 144026181, 144507595, 144565062, 144500188, 144216044, 144...
$ arrest_date <chr> "06/26/2015", "07/14/2015", "07/16/2015", "07/14/2015", "0...
$ pd_cd <dbl> 639, 969, 101, 879, 478, 339, 849, 203, 511, 511, 750, 339...
$ pd_desc <chr> "AGGRAVATED HARASSMENT 2", "TRAFFIC,UNCLASSIFIED INFRACTIO...
$ ky_cd <dbl> 361, 881, 344, 675, 343, 341, 677, 352, 235, 235, 359, 341...
$ ofns_desc <chr> "OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV", "OTHER TRAF...
$ law_code <chr> "PL 2403002", "VTL051101A", "PL 1200001", "AC 010125B", "P...
$ law_cat_cd <chr> "M", "M", "M", "V", "M", "M", "V", "M", "M", "M", "M", "M"...
$ arrest_boro <chr> "Q", "M", "K", "Q", "M", "B", "K", "B", "B", "Q", "Q", "M"...
$ arrest_precinct <dbl> 102, 10, 90, 103, 10, 45, 78, 47, 52, 115, 111, 30, 5, 110...
$ jurisdiction_code <dbl> 0, 3, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ age_group <chr> "45-64", "25-44", "18-24", "25-44", "18-24", "18-24", "18-...
$ perp_sex <chr> "M", "M", "F", "M", "M", "M", "M", "M", "F", "M", "M", "F"...
$ perp_race <chr> "WHITE HISPANIC", "WHITE HISPANIC", "WHITE HISPANIC", "WHI...
$ x_coord_cd <dbl> 1031076, 984791, 994026, 1037132, 984602, 1030990, 991330,...
$ y_coord_cd <dbl> 193779, 209846, 195548, 196129, 210686, 255310, 187303, 26...
$ latitude <dbl> 40.69844, 40.74266, 40.70341, 40.70486, 40.74497, 40.86733...
$ longitude <dbl> -73.83113, -73.99805, -73.96474, -73.80927, -73.99873, -73...
$ lon_lat <chr> "POINT (-73.83112953899997 40.69843969400005)", "POINT (-7...
$ new_georeferenced_column <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
df <- df_arrests %>%
rename(incident_key=arrest_key) %>%
rename(occur_date = arrest_date) %>%
bind_rows(df_shooting)
glimpse(df)
Rows: 5,128,905
Columns: 29
$ incident_key <dbl> 144026181, 144507595, 144565062, 144500188, 144216044, 144...
$ occur_date <chr> "06/26/2015", "07/14/2015", "07/16/2015", "07/14/2015", "0...
$ pd_cd <dbl> 639, 969, 101, 879, 478, 339, 849, 203, 511, 511, 750, 339...
$ pd_desc <chr> "AGGRAVATED HARASSMENT 2", "TRAFFIC,UNCLASSIFIED INFRACTIO...
$ ky_cd <dbl> 361, 881, 344, 675, 343, 341, 677, 352, 235, 235, 359, 341...
$ ofns_desc <chr> "OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV", "OTHER TRAF...
$ law_code <chr> "PL 2403002", "VTL051101A", "PL 1200001", "AC 010125B", "P...
$ law_cat_cd <chr> "M", "M", "M", "V", "M", "M", "V", "M", "M", "M", "M", "M"...
$ arrest_boro <chr> "Q", "M", "K", "Q", "M", "B", "K", "B", "B", "Q", "Q", "M"...
$ arrest_precinct <dbl> 102, 10, 90, 103, 10, 45, 78, 47, 52, 115, 111, 30, 5, 110...
$ jurisdiction_code <dbl> 0, 3, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ age_group <chr> "45-64", "25-44", "18-24", "25-44", "18-24", "18-24", "18-...
$ perp_sex <chr> "M", "M", "F", "M", "M", "M", "M", "M", "F", "M", "M", "F"...
$ perp_race <chr> "WHITE HISPANIC", "WHITE HISPANIC", "WHITE HISPANIC", "WHI...
$ x_coord_cd <dbl> 1031076, 984791, 994026, 1037132, 984602, 1030990, 991330,...
$ y_coord_cd <dbl> 193779, 209846, 195548, 196129, 210686, 255310, 187303, 26...
$ latitude <dbl> 40.69844, 40.74266, 40.70341, 40.70486, 40.74497, 40.86733...
$ longitude <dbl> -73.83113, -73.99805, -73.96474, -73.80927, -73.99873, -73...
$ lon_lat <chr> "POINT (-73.83112953899997 40.69843969400005)", "POINT (-7...
$ new_georeferenced_column <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ occur_time <time> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ boro <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ precinct <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ location_desc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ statistical_murder_flag <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ perp_age_group <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ vic_age_group <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ vic_sex <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ vic_race <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
df%>%
count(ofns_desc,sort = T)
NA
It seems that arrests for dangerous drugs are on a decline.
df %>%
filter(ofns_desc == "DANGEROUS DRUGS") %>%
mutate(
occur_date = lubridate::mdy(occur_date),
year = lubridate::year(occur_date),
year = factor(year),
month = lubridate:: month(occur_date),
month = factor(month))%>%
group_by(occur_date)%>%
mutate(
count_daily = n())%>%
ungroup()%>%
ggplot(aes(factor(year), count_daily,
fill = year, color = year
)) +
geom_boxplot(alpha = 0.2, size = 1, show.legend = FALSE) +
labs(x = NULL, y = "Daily Drug Arrests")
# The palette with grey:
df %>%
filter(ofns_desc == "ASSAULT 3 & RELATED OFFENSES") %>%
mutate(
occur_date = lubridate::mdy(occur_date),
year = lubridate::year(occur_date),
year = factor(year),
month = lubridate:: month(occur_date),
month = factor(month))%>%
group_by(occur_date)%>%
mutate(
count_daily = n())%>%
ungroup()%>%
ggplot(aes(year, count_daily,
fill = year, color = year, group =year)) +
geom_boxplot(alpha = 0.2, size = 1, show.legend = FALSE)+
labs(x = NULL, y = "Daily Assault Arrests")
df1 <- df1 %>%
mutate(
crime_type = case_when(
str_detect(ofns_desc, "ASSAULT")~"violent",
str_detect(ofns_desc, "shooting")~"violent",
str_detect(ofns_desc, "HOMICIDE")~"violent",
str_detect(ofns_desc, "MURDER")~"violent",
str_detect(ofns_desc, "RAPE")~"violent",
str_detect(ofns_desc, "FORC")~"violent",
str_detect(ofns_desc, "SEX")~"violent",
str_detect(ofns_desc, "KID")~"violent",
str_detect(ofns_desc, "WEAP")~"violent",
str_detect(ofns_desc, "DANG")~"violent",
str_detect(ofns_desc, "HARASSMENT")~"violent",
str_detect(ofns_desc, "THEFT")~"theft_property",
str_detect(ofns_desc, "LARCENY")~"theft_property",
str_detect(ofns_desc, "ARSON")~"theft_property",
str_detect(ofns_desc, "ENTRY")~"theft_property",
str_detect(ofns_desc, "FRAUD")~"theft_property",
str_detect(ofns_desc, "VEHICLE")~"theft_property",
str_detect(ofns_desc, "ROBBERY")~"theft_property",
str_detect(ofns_desc, "BURGLAR")~"theft_property",
str_detect(ofns_desc, "FORG")~"theft_property",
str_detect(ofns_desc, "PROP")~"theft_property",
str_detect(ofns_desc, "PARK")~"theft_property",
str_detect(ofns_desc, "DRIVE")~"theft_property",
TRUE~"other"
),
crime_type=factor(crime_type)
)
df1 %>%
select(crime_type) %>%
n_distinct()
[1] 3
df1 %>%
filter(crime_type == "violent") %>%
group_by(year_mon) %>%
summarize(count = n()) %>%
ungroup() %>%
ggplot(aes(year_mon,count))+
geom_point()+
geom_line()
`summarise()` ungrouping output (override with `.groups` argument)
NA
NA
df_weather <- read_csv("https://www.ncei.noaa.gov/orders/cdo/2353296.csv") %>%
janitor::clean_names() %>%
select(date,awnd,prcp,snow,snwd,tmax,tmin,wsf2,wt01,wt02,wt03,wt04,wt06,wt08) %>%
replace_na(list(wt01=0,wt02=0,wt03=0,wt04=0,wt06=0,wt08=0))
Parsed with column specification:
cols(
.default = col_double(),
STATION = col_character(),
NAME = col_character(),
DATE = col_date(format = ""),
PGTM = col_logical(),
TAVG = col_logical(),
TSUN = col_logical(),
WT03 = col_logical()
)
See spec(...) for full column specifications.
1 parsing failure.
row col expected actual file
1096 PGTM 1/0/T/F/TRUE/FALSE 1509 'https://www.ncei.noaa.gov/orders/cdo/2353296.csv'
df_holiday <- read_csv("usholidays.csv") %>%
janitor::clean_names() %>%
mutate(date = lubridate::mdy(date))
Parsed with column specification:
cols(
Date = col_character(),
Holiday = col_character()
)
mean(df_weather$awnd,na.rm = T)
[1] 5.05651
df1 <-df1 %>%
inner_join(df_weather,by = c("occur_date"="date")) %>%
left_join(df_holiday, by = c("occur_date" = "date")) %>%
mutate(holiday = ifelse(is.na(holiday),0,1))
glimpse(df1)
Rows: 1,507,038
Columns: 40
$ incident_key <dbl> 144026181, 144507595, 144565062, 144500188, 144216044, 1449...
$ occur_date <date> 2015-06-26, 2015-07-14, 2015-07-16, 2015-07-14, 2015-07-03...
$ pd_cd <dbl> 639, 969, 101, 879, 478, 339, 849, 203, 511, 511, 750, 339,...
$ pd_desc <chr> "AGGRAVATED HARASSMENT 2", "TRAFFIC,UNCLASSIFIED INFRACTION...
$ ky_cd <dbl> 361, 881, 344, 675, 343, 341, 677, 352, 235, 235, 359, 341,...
$ ofns_desc <chr> "OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV", "OTHER TRAFF...
$ law_code <chr> "PL 2403002", "VTL051101A", "PL 1200001", "AC 010125B", "PL...
$ law_cat_cd <chr> "M", "M", "M", "V", "M", "M", "V", "M", "M", "M", "M", "M",...
$ arrest_precinct <dbl> 102, 10, 90, 103, 10, 45, 78, 47, 52, 115, 111, 30, 5, 110,...
$ jurisdiction_code <dbl> 0, 3, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
$ age_group <chr> "45-64", "25-44", "18-24", "25-44", "18-24", "18-24", "18-2...
$ perp_sex <chr> "M", "M", "F", "M", "M", "M", "M", "M", "F", "M", "M", "F",...
$ perp_race <chr> "WHITE HISPANIC", "WHITE HISPANIC", "WHITE HISPANIC", "WHIT...
$ x_coord_cd <dbl> 1031076, 984791, 994026, 1037132, 984602, 1030990, 991330, ...
$ y_coord_cd <dbl> 193779, 209846, 195548, 196129, 210686, 255310, 187303, 262...
$ latitude <dbl> 40.69844, 40.74266, 40.70341, 40.70486, 40.74497, 40.86733,...
$ longitude <dbl> -73.83113, -73.99805, -73.96474, -73.80927, -73.99873, -73....
$ lon_lat <chr> "POINT (-73.83112953899997 40.69843969400005)", "POINT (-73...
$ occur_time <dbl> 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 12, 11, 10, 9, 8,...
$ boro <chr> "QUEENS", "MANHATTAN", "BROOKLYN", "QUEENS", "MANHATTAN", "...
$ precinct <dbl> 102, 10, 90, 103, 10, 45, 78, 47, 52, 115, 111, 30, 5, 110,...
$ statistical_murder_flag <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
$ year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
$ month <dbl> 6, 7, 7, 7, 7, 7, 6, 7, 7, 7, 6, 7, 7, 7, 8, 7, 7, 6, 7, 6,...
$ year_mon <yearmon> Jun 2015, Jul 2015, Jul 2015, Jul 2015, Jul 2015, Jul 2...
$ crime_type <fct> other, other, violent, other, theft_property, theft_propert...
$ awnd <dbl> 4.47, 2.68, 5.59, 2.68, 2.91, 4.25, 3.13, 4.47, 4.47, 4.47,...
$ prcp <dbl> 0.00, 0.42, 0.00, 0.42, 0.00, 1.95, 0.00, 0.12, 0.00, 0.12,...
$ snow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ snwd <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ tmax <dbl> 81, 82, 80, 82, 82, 87, 83, 81, 94, 81, 71, 88, 87, 83, 87,...
$ tmin <dbl> 69, 73, 64, 73, 66, 76, 65, 72, 82, 72, 58, 75, 76, 72, 72,...
$ wsf2 <dbl> 16.1, 8.1, 15.0, 8.1, 8.9, 12.1, 13.0, 13.0, 12.1, 13.0, 16...
$ wt01 <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
$ wt02 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ wt03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ wt04 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ wt06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ wt08 <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,...
$ holiday <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
boro_site <- "https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=GeoJSON" #CAA internet blocked me from automating this
# load boros
nyboros <- geojsonio::geojson_read("json/Borough Boundaries.geojson", what = "sp")
#prepare color mapping
nyboros$boro_name <- factor(nyboros$boro_name)
factpal <- colorFactor("Set1", nyboros$boro_name)
# build murders in the rain map
leaflet() %>%
addPolygons(data = nyboros, weight = 2, fillColor = ~factpal(boro_name), group = "Boros") %>%
setView(-74,40.7,zoom=10) %>%
addTiles(group = "default") %>%
addMarkers(data = df1 %>% filter(statistical_murder_flag == TRUE & prcp>0.1),~longitude, ~latitude,popup = ~as.character(boro), label = ~as.character(ofns_desc), group = "Shooting Murders in the Rain") %>%
addMarkers(data = df1 %>% filter(statistical_murder_flag == TRUE & prcp==0),~longitude, ~latitude,popup = ~as.character(boro), label = ~as.character(ofns_desc), group = "Shooting Murders in the Clear") %>%
addLayersControl(
baseGroups = c("default"),
overlayGroups = c("Boros", "Shooting Murders in the Rain", "Shooting Murders in the Clear"),
options = layersControlOptions(collapsed = FALSE))
NA
df2 <- df1 %>%
select(occur_date,boro,month,awnd,prcp,snow,tmax,tmin,wsf2,wt01,wt02,wt03,wt04,wt06,wt08,holiday,crime_type) %>%
mutate(
prcp = ifelse(prcp >0.25, 1,0),
snow = ifelse(snow >0.25, 1,0),
awnd = ifelse(awnd>mean(awnd,na.rm = T),1,0), # proxy for windy days
weekday = lubridate::wday(occur_date,label = F, abbr = F)
) %>%
group_by(boro,occur_date, crime_type) %>%
mutate(value = n()) %>%
distinct() %>%
ungroup() %>%
select(-occur_date) %>%
na.omit()
glimpse(df2)
Rows: 28,400
Columns: 18
$ boro <chr> "QUEENS", "MANHATTAN", "BROOKLYN", "QUEENS", "MANHATTAN", "BRONX", "BROO...
$ month <dbl> 6, 7, 7, 7, 7, 7, 6, 7, 7, 7, 6, 7, 7, 7, 8, 7, 7, 6, 7, 6, 7, 8, 6, 7, ...
$ awnd <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
$ prcp <dbl> 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
$ snow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ tmax <dbl> 81, 82, 80, 82, 82, 87, 83, 81, 94, 81, 71, 88, 87, 83, 87, 75, 83, 84, ...
$ tmin <dbl> 69, 73, 64, 73, 66, 76, 65, 72, 82, 72, 58, 75, 76, 72, 72, 69, 72, 68, ...
$ wsf2 <dbl> 16.1, 8.1, 15.0, 8.1, 8.9, 12.1, 13.0, 13.0, 12.1, 13.0, 16.1, 13.0, 12....
$ wt01 <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
$ wt02 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ wt03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ wt04 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ wt06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ wt08 <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, ...
$ holiday <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
$ crime_type <fct> other, other, violent, other, theft_property, theft_property, other, oth...
$ weekday <dbl> 6, 3, 5, 3, 6, 5, 5, 4, 2, 4, 7, 3, 5, 2, 4, 7, 2, 4, 4, 7, 6, 7, 1, 1, ...
$ value <int> 72, 110, 121, 71, 111, 72, 122, 57, 95, 90, 71, 124, 96, 48, 157, 52, 68...
Since we are attempting to predict 3 outputs across 5 boros there are a few methods to use when building a regression to predict are 3 target variables.